Introduction

This notebook presents the exploratory data analysis (EDA) for the Kaggle Playground Series - Season 5 Episode 2 (S5E2), which aims to predict backpack prices based on product attributes.

Load Data

# Read the training and test datasets
# NOTE(review): relative paths — assumes the knitr working directory contains
# the Kaggle S5E2 files (train.csv / test.csv); confirm before rendering.
train <- read.csv("train.csv")
test <- read.csv("test.csv")

Dataset Overview

# Check the structure of the training data:
# 300,000 rows, 11 columns (id, 8 product attributes, and the target Price)
str(train)
## 'data.frame':    300000 obs. of  11 variables:
##  $ id                  : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Brand               : chr  "Jansport" "Jansport" "Under Armour" "Nike" ...
##  $ Material            : chr  "Leather" "Canvas" "Leather" "Nylon" ...
##  $ Size                : chr  "Medium" "Small" "Small" "Small" ...
##  $ Compartments        : num  7 10 2 8 1 10 3 1 8 2 ...
##  $ Laptop.Compartment  : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Waterproof          : chr  "No" "Yes" "No" "No" ...
##  $ Style               : chr  "Tote" "Messenger" "Messenger" "Messenger" ...
##  $ Color               : chr  "Black" "Green" "Red" "Green" ...
##  $ Weight.Capacity..kg.: num  11.6 27.1 16.6 12.9 17.7 ...
##  $ Price               : num  112.2 68.9 39.2 80.6 86 ...
# Summary statistics for the target variable:
# Price spans 15-150 with mean ~81.4 and a near-symmetric spread
summary(train$Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   47.38   80.96   81.41  115.02  150.00
# Check for missing values in all columns:
# only Weight.Capacity..kg. has NAs (138 of 300,000 rows)
colSums(is.na(train))
##                   id                Brand             Material 
##                    0                    0                    0 
##                 Size         Compartments   Laptop.Compartment 
##                    0                    0                    0 
##           Waterproof                Style                Color 
##                    0                    0                    0 
## Weight.Capacity..kg.                Price 
##                  138                    0

Price Distribution

# Histogram of the raw target variable
hist(train$Price,
     breaks = 50,
     xlab = "Price",
     main = "Distribution of Backpack Prices")

# Histogram of the log1p-transformed target (compresses the right tail)
hist(log1p(train$Price),
     breaks = 50,
     xlab = "Log(1 + Price)",
     main = "Log-Transformed Distribution of Backpack Prices",
     col = "gray")

Capacity vs. Price

# Take a fixed (seeded) random subsample of 5,000 rows so the
# scatterplot stays readable
set.seed(42)
train_sample <- train[sample(nrow(train), size = 5000), ]

ggplot(train_sample, aes(x = Weight.Capacity..kg., y = Price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(title = "Weight Capacity vs. Price (Sampled 5k)")

Numeric Features vs. Price

# Keep only numeric columns.
# NOTE(review): this includes `id`, whose correlation with Price is
# meaningless index noise — consider select(where(is.numeric), -id).
numeric_cols <- train %>% select(where(is.numeric))

# Correlation of every numeric feature with Price.
# use = "complete.obs" drops the 138 rows with missing Weight.Capacity..kg.
correlations <- cor(numeric_cols, use = "complete.obs")["Price", ]

# Drop Price's self-correlation by name. A direct name filter replaces the
# original x[-which(...)] pattern, which silently returns an empty vector
# when the name is absent.
correlations <- sort(correlations[names(correlations) != "Price"], decreasing = TRUE)

# Visualize as horizontal bars ordered by correlation strength
cor_df <- data.frame(Feature = names(correlations), Correlation = correlations)
ggplot(cor_df, aes(x = Correlation, y = fct_reorder(Feature, Correlation))) +
  geom_col(fill = "lightblue") +
  labs(title = "Correlation of Numerical Features with Price",
       x = "Correlation with Price", y = "Feature") +
  theme_minimal()

Categorical Feature: Brand Frequency

# Top 20 most frequent brands.
# count() replaces the group_by()/summarise()/arrange() chain;
# name = "count" keeps the original column name, sort = TRUE sorts descending.
brand_count <- train %>%
  count(Brand, name = "count", sort = TRUE) %>%
  head(20)

# geom_col() is the idiomatic form of geom_bar(stat = "identity")
ggplot(brand_count, aes(x = reorder(Brand, -count), y = count)) +
  geom_col(fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 20 Most Frequent Brands", x = "Brand", y = "Count")

Capacity vs. Price

# Sample a subset for clearer visualization
# NOTE(review): this chunk duplicates the earlier "Capacity vs. Price"
# section almost verbatim (same seed, same sample size, same plot) —
# consider removing one of the two copies.
set.seed(42)
train_sample <- train[sample(nrow(train), 5000), ]

ggplot(train_sample, aes(x = Weight.Capacity..kg., y = Price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("Weight Capacity vs. Price (Sampled 5k)")

Brand vs. Price

# Compare Price distributions across brands with a boxplot
ggplot(train, aes(x = Brand, y = Price)) +
  geom_boxplot() +
  labs(title = "Brand vs. Price") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Material vs. Price

# Price distribution broken out by backpack material
ggplot(train, aes(x = Material, y = Price)) +
  geom_boxplot() +
  labs(title = "Material vs. Price")

Laptop Compartment vs. Price

# Does having a laptop compartment shift the price distribution?
ggplot(train, aes(x = Laptop.Compartment, y = Price)) +
  geom_boxplot() +
  labs(title = "Laptop Compartment vs. Price")

Interaction Analysis (Brand × Material, Material × Laptop Compartment)

# Combined Brand × Material key.
# NOTE(review): mutates `train` in place — later chunks may rely on this column.
train$Brand_Material <- paste(train$Brand, train$Material, sep = "_")

# Frequency of each combination; count() replaces the
# group_by()/summarise()/arrange() chain (name = "count" preserves the
# original column name, sort = TRUE sorts descending)
top_combo <- train %>%
  count(Brand_Material, name = "count", sort = TRUE)

# geom_col() is the idiomatic form of geom_bar(stat = "identity")
ggplot(head(top_combo, 20), aes(x = reorder(Brand_Material, count), y = count)) +
  geom_col(fill = "lightgreen") +
  coord_flip() +
  labs(title = "Top 20 Brand × Material Combinations", x = "Brand_Material", y = "Count")

# Brand-colored scatter on a reproducible 5k subsample; the legend is
# suppressed because the colors are only meant to reveal clustering.
set.seed(42)
sampled <- sample_n(train, 5000)
ggplot(sampled, aes(x = Weight.Capacity..kg., y = Price, color = Brand)) +
  geom_point(alpha = 0.5) +
  labs(title = "Weight Capacity vs Price Colored by Brand") +
  theme(legend.position = "none")

# NOTE(review): Brand_WeightCombo is created here but never used anywhere
# else in this notebook — either analyze it or drop this line.
train$Brand_WeightCombo <- paste(train$Brand, round(train$Weight.Capacity..kg.), sep = "_")
# Combined Material × Laptop Compartment key (mutates train in place)
train$Material_Laptop <- paste(train$Material, train$Laptop.Compartment, sep = "_")

# Average price per Material × Laptop Compartment combination,
# sorted so the most expensive combinations come first
material_laptop_price <- train %>%
  group_by(Material_Laptop) %>%
  summarise(mean_price = mean(Price, na.rm = TRUE)) %>%
  arrange(desc(mean_price))

# geom_col() is the idiomatic form of geom_bar(stat = "identity");
# the original title was garbled ("Material × LaptopFeature Importance ...")
ggplot(head(material_laptop_price, 20), aes(x = reorder(Material_Laptop, mean_price), y = mean_price)) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  labs(title = "Top 20 Material × Laptop Compartment Avg Prices",
       x = "Material × Laptop Compartment", y = "Average Price")

# Hold Brand fixed (Jansport) to check whether weight capacity or
# compartment count still carries signal within a single brand.
sub_train <- filter(train, Brand == "Jansport")

ggplot(sub_train, aes(x = Weight.Capacity..kg., y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = "blue", method = "lm") +
  labs(title = "Within Jansport: Weight Capacity vs Price")

ggplot(sub_train, aes(x = Compartments, y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = "red", method = "lm") +
  labs(title = "Within Jansport: Compartments vs Price")

The trend lines (blue and red) are almost horizontal, with no apparent slope; within a single brand, the correlation of weight capacity and compartment count with price is very weak.

Feature Importance

# Build the modeling frame: drop id, keep the 9 predictors plus the target,
# and remove the 138 rows with missing Weight.Capacity..kg.
train_model <- train %>%
  select(Compartments, `Weight.Capacity..kg.`, Brand, Material, Size, Laptop.Compartment, Waterproof, Style, Color, Price) %>%
  na.omit() 

# One-hot encode the predictors via model.matrix; "-1" drops the intercept.
# NOTE(review): with an intercept-free formula only the first factor expands
# to all of its levels — later factors still drop a reference level. The bare
# "Brand" column in the importance table below suggests an empty-string
# category in Brand; worth verifying against the raw data.
X <- model.matrix(Price ~ . -1, data = train_model)
y <- train_model$Price

# Set LightGBM parameters and train a default regression model (RMSE metric)
# purely to rank feature importance — not tuned for predictive accuracy.
dtrain <- lgb.Dataset(data = X, label = y)
params <- list(objective = "regression", metric = "rmse")
model <- lgb.train(params, dtrain, nrounds = 100)
## [LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
## [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045109 seconds.
## You can set `force_col_wise=true` to remove the overhead.
## [LightGBM] [Info] Total Bins 318
## [LightGBM] [Info] Number of data points in the train set: 299862, number of used features: 28
## [LightGBM] [Info] Start training from score 81.419736
# Gain-based importance: Weight.Capacity..kg. (~40% of gain) and
# Compartments (~12%) dominate; all one-hot categorical columns contribute
# only 1-3% each.
importance <- lgb.importance(model)
print(importance)
##                   Feature       Gain       Cover  Frequency
##                    <char>      <num>       <num>      <num>
##  1:  Weight.Capacity..kg. 0.40490899 0.309545919 0.35433333
##  2:          Compartments 0.12232589 0.078358384 0.13633333
##  3:       MaterialLeather 0.03004754 0.056453124 0.02200000
##  4:         MaterialNylon 0.02739180 0.027698039 0.02533333
##  5:         WaterproofYes 0.02539792 0.018337336 0.02200000
##  6:          WaterproofNo 0.02308945 0.036453774 0.02466667
##  7:             ColorGray 0.02163621 0.027349327 0.02233333
##  8:  Laptop.CompartmentNo 0.02028806 0.010661375 0.02066667
##  9:             StyleTote 0.02016635 0.007396463 0.02366667
## 10:     BrandUnder_Armour 0.01954133 0.023618748 0.01933333
## 11:        StyleMessenger 0.01940521 0.006768977 0.02433333
## 12:            ColorGreen 0.01935373 0.056373812 0.02066667
## 13:             SizeLarge 0.01902882 0.031838092 0.02366667
## 14:           BrandAdidas 0.01819296 0.050529276 0.02066667
## 15:              ColorRed 0.01788407 0.016432286 0.01900000
## 16:            ColorBlack 0.01694886 0.048341202 0.01933333
## 17:             BrandPuma 0.01684930 0.005900360 0.01933333
## 18:             SizeSmall 0.01638647 0.019204197 0.01933333
## 19:             ColorPink 0.01633635 0.008003286 0.01833333
## 20:            SizeMedium 0.01589825 0.006400142 0.01500000
## 21:         BrandJansport 0.01541843 0.020633310 0.01833333
## 22: Laptop.CompartmentYes 0.01409706 0.015195558 0.01766667
## 23:        MaterialCanvas 0.01390546 0.029451614 0.01666667
## 24:             ColorBlue 0.01372862 0.042023751 0.01700000
## 25:     MaterialPolyester 0.01352541 0.022924345 0.01700000
## 26:                 Brand 0.01316909 0.014160785 0.01266667
## 27:         StyleBackpack 0.01273681 0.008751055 0.01700000
## 28:             BrandNike 0.01234155 0.001195463 0.01333333
##                   Feature       Gain       Cover  Frequency
# Plot the 20 most important features
lgb.plot.importance(importance, top_n = 20)

Next Steps

Based on the exploratory analysis, the following steps are planned for the modeling phase:

  1. Feature Engineering:
    • Encode categorical variables such as Brand, Material, Size, Style using one-hot encoding or target encoding.
    • Convert binary variables (e.g., Waterproof, Laptop.Compartment) into logical/numeric format.
    • Consider interaction terms or grouped features if needed.
  2. Model Selection:
    • Start with baseline models such as Linear Regression and Decision Tree Regressor.
    • Move to ensemble methods like Random Forest and Gradient Boosting (e.g., XGBoost or LightGBM).
  3. Data Splitting and Evaluation:
    • Create a validation set or use cross-validation (e.g., 5-fold CV).
    • Evaluate models based on RMSE, as required by the competition metric.
  4. Hyperparameter Tuning:
    • Use grid search or randomized search to optimize key model parameters.
  5. Final Submission:
    • Generate predictions on the test.csv and submit to Kaggle.
    • Track performance on the public leaderboard and adjust strategies accordingly.

These steps will be iteratively refined based on validation results and model diagnostics.